################################################################################
# 04_merge_event_data.R
# Merge ETS Treatment Data with IPEDS Data to Create Analysis Datasets
#
# Inputs:
#   - data/cleaned/ets_treatment_data.xlsx
#   - data/cleaned/ipeds_data_cleaned.xlsx
#   - data/raw/policy/state_treatment.dta (state economic + policy controls)
#
# Outputs:
#   - data/cleaned/enrollment_event_data.xlsx
#   - data/cleaned/graduation_event_data.xlsx
#
# Notes:
#   - Uses IPEDS data at native year convention (no year offset applied)
#   - ND and TN are KEPT in the output (24 states); downstream scripts
#     (06_main_regressions.R) exclude them for the main analysis
#   - Creates log outcome variables and event study interaction terms
#   - Adds selectivity classification based on 2010 SAT/ACT scores
#   - State economic/policy controls come from state_treatment.dta;
#     population variables are NA (downstream scripts hardcode shrinking states)
#
# IMPORTANT: The pre-cleaned enrollment/graduation files in data/cleaned/ have
#   complete control coverage from the original Stata pipeline. This script
#   regenerates them from IPEDS API data which has gaps (SAT/ACT ~25% NA,
#   no 2008 state controls). If the pre-cleaned files already exist, this script
#   will SKIP regeneration to preserve them. Delete them manually to force rebuild.
################################################################################

library(tidyverse)
library(readxl)
library(writexl)
library(haven)

cat("\n========================================\n")
cat("04: Merging ETS Treatment and IPEDS Data\n")
cat("========================================\n\n")

# Skip if pre-cleaned files already exist (they have complete control coverage)
if (file.exists("data/cleaned/enrollment_event_data.xlsx") &&
    file.exists("data/cleaned/graduation_event_data.xlsx")) {
  cat("  Pre-cleaned event data files already exist. Skipping merge.\n")
  cat("  (Delete data/cleaned/enrollment_event_data.xlsx and graduation_event_data.xlsx\n")
  cat("   to force regeneration from IPEDS API data.)\n\n")
  cat("04_merge_event_data.R complete (skipped).\n")
  return(invisible(NULL))
}

# Ensure output directory exists
if (!dir.exists("data/cleaned")) dir.create("data/cleaned", recursive = TRUE)

# ══════════════════════════════════════════════════════════════════════════════
# Constants
# ══════════════════════════════════════════════════════════════════════════════

# Sample states (22 in-sample + ND and TN for robustness tests)
SAMPLE_STATES <- c("AK", "AR", "CT", "DC", "DE", "HI", "LA", "MD", "ME",
                   "MS", "NC", "NE", "NH", "NJ", "NV", "OR", "PA", "SC",
                   "VA", "VT", "WI", "WV", "ND", "TN")

# Universities to drop (brute-force exclusions from original Stata code)
DROP_UNITIDS <- c(221856, 186371, 233602, 233912, 216524,
                  128902, 211440, 128780, 181428, 413723)

# ══════════════════════════════════════════════════════════════════════════════
# 1. Load Input Data
# ══════════════════════════════════════════════════════════════════════════════

cat("Loading input data...\n")

# --- ETS Treatment Data ---
ets <- read_excel("data/cleaned/ets_treatment_data.xlsx")
cat("  ETS treatment data:", nrow(ets), "rows,",
    n_distinct(ets$State), "states\n")

# --- IPEDS Data ---
ipeds <- read_excel("data/cleaned/ipeds_data_cleaned.xlsx") %>%
  filter(year >= 2008, year <= 2020)
cat("  IPEDS data:", nrow(ipeds), "rows,",
    n_distinct(ipeds$unitid), "universities\n")

# --- State-Level Controls (economic + policy) ---
state_controls_file <- "data/raw/policy/state_treatment.dta"
if (file.exists(state_controls_file)) {
  state_controls <- read_stata(state_controls_file) %>%
    select(State, year,
           real_income, unemployment_rate,
           passevals, implementevals, eliminate_tenure,
           increase_probationary_period, weaken_bargaining,
           eliminate_union_dues, won_race_top, common_core, edtpa) %>%
    distinct()
  cat("  State controls:", nrow(state_controls), "rows\n")
  has_state_controls <- TRUE
} else {
  cat("  WARNING: State controls file not found. Policy/economic controls will be NA.\n")
  has_state_controls <- FALSE
}

# ══════════════════════════════════════════════════════════════════════════════
# 2. Prepare IPEDS Data
# ══════════════════════════════════════════════════════════════════════════════

cat("\nPreparing IPEDS data...\n")

# Rename state variable and filter to sample states
ipeds <- ipeds %>%
  rename(State = state_abbr) %>%
  filter(State %in% SAMPLE_STATES)

cat("  After state filter:", nrow(ipeds), "observations,",
    n_distinct(ipeds$unitid), "universities\n")

# ── Rename completions columns ──────────────────────────────────────────────

ipeds <- ipeds %>%
  rename(
    ctotalt = ba_masters_teacher_preparation_completions_total,
    ctotalm = ba_masters_teacher_preparation_completions_male,
    ctotalw = ba_masters_teacher_preparation_completions_female,
    cwhitt  = ba_masters_teacher_preparation_completions_white,
    cbkaat  = ba_masters_teacher_preparation_completions_black,
    chispt  = ba_masters_teacher_preparation_completions_hispanic
  )

# Additional completion race/ethnicity vars (not in IPEDS R download; set to 0)
ipeds <- ipeds %>%
  mutate(
    c2mort  = 0L,
    cunknt  = 0L,
    casiat  = 0L,
    cbkaam  = 0L,
    cbkaaw  = 0L,
    chispm  = 0L,
    chispw  = 0L,
    cwhitm  = 0L,
    cwhitw  = 0L
  )

# ── Rename enrollment columns ───────────────────────────────────────────────

ipeds <- ipeds %>%
  rename(
    eftotlt = all_total_ed_enrollment,
    eftotlm = all_male_ed_enrollment,
    eftotlw = all_female_ed_enrollment,
    efwhitt = all_white_ed_enrollment,
    efbkaat_enrl = all_black_ed_enrollment,
    efhispt_enrl = all_hispanic_ed_enrollment
  ) %>%
  # Rename to avoid collision with completions race vars
  rename(
    efbkaat = efbkaat_enrl,
    efhispt = efhispt_enrl
  )

# Additional enrollment race variables (not in IPEDS R download; set to 0)
ipeds <- ipeds %>%
  mutate(
    efbkaam  = 0L,
    efbkaaw  = 0L,
    efhispm  = 0L,
    efhispw  = 0L,
    efwhitm  = 0L,
    efwhitw  = 0L,
    efaiant  = 0L,
    efasiat  = 0L,
    ef2mort  = 0L,
    efunknt  = 0L
  )

# ── Rename directory / institutional columns ────────────────────────────────

ipeds <- ipeds %>%
  rename(
    name      = inst_name,
    statefips = fips
  ) %>%
  mutate(
    city     = "",
    ccbasic  = NA_real_,
    carnegie = NA_real_,
    statename = statefips,   # Use fips name as statename placeholder
    state_name = statefips
  )

# ── Rename admissions columns ───────────────────────────────────────────────

ipeds <- ipeds %>%
  rename(
    satvr25 = sat_crit_read_25_pctl,
    satmt25 = sat_math_25_pctl,
    actcm25 = act_composite_25_pctl
  )

# ── Create university-level control variables with _2 suffix ────────────────
# These are contemporaneous values mapped to the column names expected downstream

ipeds <- ipeds %>%
  mutate(
    # Test optional: based on open admissions policy (string in IPEDS API data)
    test_optional = if_else(
      !is.na(open_admissions_policy) & open_admissions_policy == "Yes", 1L, 0L
    ),
    # SAT/ACT percentages
    satpct2 = sat_percent_submitting,
    actpct2 = act_percent_submitting,
    # SAT/ACT score percentiles
    satvr25_2 = satvr25,
    satvr75_2 = sat_crit_read_75_pctl,
    satmt25_2 = satmt25,
    satmt75_2 = sat_math_75_pctl,
    actcm25_2 = actcm25,
    actcm75_2 = act_composite_75_pctl,
    # Financial aid (IPEDS stores pell_percent as proportion; multiply by 100)
    pell_percent2 = pell_percent * 100,
    pell_amount2  = pell_average_amount,
    # Loan variables (not available from IPEDS API download)
    loan_percent2 = NA_real_,
    loan_average2 = NA_real_,
    # Total enrollment
    enrollment_total2 = enrollment_fall_fulltime_firsttime_undergrad
  )

# ── Population / demographic placeholders ───────────────────────────────────
# These are not available from IPEDS API; set as NA
# (downstream scripts hardcode SHRINKING_STATES instead of using these)

ipeds <- ipeds %>%
  mutate(
    cohort_5_9  = NA_real_,
    cohort_10_14 = NA_real_,
    cohort_15_17 = NA_real_,
    population   = NA_real_,
    pop_5_17     = NA_real_,
    percentage_change_08_18 = NA_real_,
    pop_percentage_change   = NA_real_
  )

cat("  IPEDS variables prepared:", nrow(ipeds), "observations\n")

# ══════════════════════════════════════════════════════════════════════════════
# 3. Merge State-Level Controls
# ══════════════════════════════════════════════════════════════════════════════

if (has_state_controls) {
  cat("  Merging state-level controls...\n")
  ipeds <- ipeds %>%
    left_join(state_controls, by = c("State", "year"))
  cat("  After state controls merge:", nrow(ipeds), "observations\n")
} else {
  # Create placeholder columns
  ipeds <- ipeds %>%
    mutate(
      real_income = NA_real_,
      unemployment_rate = NA_real_,
      passevals = NA_real_,
      implementevals = NA_real_,
      eliminate_tenure = NA_real_,
      increase_probationary_period = NA_real_,
      weaken_bargaining = NA_real_,
      eliminate_union_dues = NA_real_,
      won_race_top = NA_real_,
      common_core = NA_real_,
      edtpa = NA_real_
    )
}

# ══════════════════════════════════════════════════════════════════════════════
# 4. Create Enrollment Event Data
# ══════════════════════════════════════════════════════════════════════════════

cat("\n========================================\n")
cat("Creating Enrollment Event Data\n")
cat("========================================\n")

# ── Merge IPEDS with ETS treatment data (many:1 on State, year) ─────────────
# No year offset: IPEDS data used at its native year convention.
# Completions and enrollment are both from the same IPEDS year row.

enrollment <- ipeds %>%
  left_join(ets, by = c("State", "year"))

cat("  After ETS merge:", nrow(enrollment), "observations\n")

# ── Sort and apply sample restrictions ──────────────────────────────────────

enrollment <- enrollment %>%
  arrange(unitid, year)

# Drop universities with zero total completions across ALL years
enrollment <- enrollment %>%
  group_by(unitid) %>%
  mutate(univ_grads_all = sum(ctotalt, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(univ_grads_all > 0) %>%
  select(-univ_grads_all)

# Drop universities with zero completions in even years
enrollment <- enrollment %>%
  group_by(unitid) %>%
  mutate(univ_grads_even = sum(
    if_else(year %in% c(2008, 2010, 2012, 2014, 2016, 2018, 2020), ctotalt, 0),
    na.rm = TRUE
  )) %>%
  ungroup() %>%
  filter(univ_grads_even > 0) %>%
  select(-univ_grads_even)

# Drop universities with zero completions in interior years (not 2008, not 2020)
enrollment <- enrollment %>%
  group_by(unitid) %>%
  mutate(univ_grads_interior = sum(
    if_else(!year %in% c(2008, 2020), ctotalt, 0),
    na.rm = TRUE
  )) %>%
  ungroup() %>%
  filter(univ_grads_interior > 0) %>%
  select(-univ_grads_interior)

# Keep only even years (biennial enrollment panel) and drop 2020
enrollment <- enrollment %>%
  filter(year %in% c(2008, 2010, 2012, 2014, 2016, 2018))

# Drop universities with zero enrollment across all years
enrollment <- enrollment %>%
  group_by(unitid) %>%
  mutate(univ_enroll = sum(eftotlt, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(univ_enroll > 0) %>%
  select(-univ_enroll)

# Replace missing enrollment race variables when enrollment is zero
enrollment <- enrollment %>%
  mutate(
    efbkaat = if_else(is.na(eftotlt) | eftotlt == 0, 0, efbkaat),
    efwhitt = if_else(is.na(eftotlt) | eftotlt == 0, 0, efwhitt),
    efhispt = if_else(is.na(eftotlt) | eftotlt == 0, 0, efhispt)
  )

# Drop brute-force exclusion unitids
enrollment <- enrollment %>%
  filter(!unitid %in% DROP_UNITIDS)

cat("  After sample restrictions:", nrow(enrollment), "observations,",
    n_distinct(enrollment$unitid), "universities\n")

# ── Create log outcome variables ────────────────────────────────────────────

enrollment <- enrollment %>%
  mutate(
    l_eftotlt     = log(eftotlt + 1),
    l_efbkaat     = log(efbkaat + 1),
    l_efwhitt     = log(efwhitt + 1),
    l_efhispt     = log(efhispt + 1),
    non_white     = eftotlt - efwhitt,
    l_nonwhite    = log(pmax(non_white, 0) + 1),
    black_hispanic = efbkaat + efhispt,
    l_blackhipanic = log(black_hispanic + 1),
    l_efbkaat2    = log(efbkaat)
  )

# ── Create event study interaction variables (biennial) ─────────────────────
# year_XXXX = DeltaTDI * I(year == XXXX), with 2012 as reference

# Drop any year_XXXX vars inherited from ETS merge
enrollment <- enrollment %>%
  select(-starts_with("year_"))

enrollment <- enrollment %>%
  mutate(
    year_2008 = if_else(time_till == -5, continuous_treat, 0),
    year_2010 = if_else(time_till == -3, continuous_treat, 0),
    year_2012 = 0,  # Reference year
    year_2014 = if_else(time_till ==  1, continuous_treat, 0),
    year_2016 = if_else(time_till ==  3, continuous_treat, 0),
    year_2018 = if_else(time_till ==  5, continuous_treat, 0)
  )

# PA and SC 2018 indicators
enrollment <- enrollment %>%
  mutate(
    pa2018 = as.integer(State == "PA" & year == 2018),
    sc2018 = as.integer(State == "SC" & year == 2018)
  )

# Replace NAs in event study vars with 0
enrollment <- enrollment %>%
  mutate(across(starts_with("year_"), ~ replace_na(.x, 0)))

# ── Select and order columns to match expected output ───────────────────────

enrollment_out <- enrollment %>%
  select(
    # Core identifiers
    unitid, year,
    # Completions
    ctotalt, ctotalm, ctotalw, casiat, cbkaat, cbkaam, cbkaaw,
    chispt, chispm, chispw, cwhitt, cwhitm, cwhitw,
    # Directory
    name, city, State, ccbasic, carnegie, statefips, statename,
    # State economic controls
    real_income, state_name, unemployment_rate,
    # Population (placeholders)
    cohort_5_9, cohort_10_14, cohort_15_17, population, pop_5_17,
    percentage_change_08_18, pop_percentage_change,
    # State policy controls
    passevals, implementevals, eliminate_tenure, increase_probationary_period,
    weaken_bargaining, eliminate_union_dues, won_race_top, common_core, edtpa,
    # Admissions
    satvr25, satmt25, actcm25, test_optional,
    # University controls (_2 suffix)
    satpct2, actpct2, satvr25_2, satvr75_2, satmt25_2, satmt75_2,
    actcm25_2, actcm75_2, pell_percent2, pell_amount2,
    loan_percent2, loan_average2, enrollment_total2,
    # ETS treatment variables
    any_of(c("ID", "passingscore", "test", "subject", "time", "test_name",
             "test_mean", "test_sd", "z_score", "passingscore_composite",
             "z_score_composite")),
    any_of("state"),
    test_index, any_of("test_index_noncomposite"),
    test_index_lead1, any_of("treat"),
    any_of("continuous_treatment_amount"), continuous_treat,
    treatment_year, time_till,
    # Lead/lag variables from ETS
    any_of(paste0("lead", 1:5)), any_of(paste0("lag", 0:7)),
    # Enrollment variables
    eftotlt, eftotlm, eftotlw, efbkaat, efbkaam, efbkaaw,
    efhispt, efhispm, efhispw, efwhitt, efwhitm, efwhitw,
    efaiant, efasiat, ef2mort, efunknt,
    # Log outcome variables
    l_eftotlt, l_efbkaat, l_efwhitt, l_efhispt,
    non_white, l_nonwhite, black_hispanic, l_blackhipanic, l_efbkaat2,
    # Event study interaction variables
    year_2008, year_2010, year_2012, year_2014, year_2016, year_2018,
    # Indicator variables
    pa2018, sc2018,
    # Catch remaining
    everything()
  )

write_xlsx(enrollment_out, "data/cleaned/enrollment_event_data.xlsx")

cat("  Saved: data/cleaned/enrollment_event_data.xlsx\n")
cat("  Rows:", nrow(enrollment_out), "\n")
cat("  Universities:", n_distinct(enrollment_out$unitid), "\n")
cat("  States:", n_distinct(enrollment_out$State), "\n")
cat("  Years:", paste(sort(unique(enrollment_out$year)), collapse = ", "), "\n")

# ══════════════════════════════════════════════════════════════════════════════
# 5. Create Graduation Event Data
# ══════════════════════════════════════════════════════════════════════════════

cat("\n========================================\n")
cat("Creating Graduation Event Data\n")
cat("========================================\n")

# ── Merge IPEDS with ETS treatment data (many:1 on State, year) ─────────────
# No year offset: IPEDS data used at its native year convention.

graduation <- ipeds %>%
  left_join(ets, by = c("State", "year"))

cat("  After ETS merge:", nrow(graduation), "observations\n")

# ── Sort and apply sample restrictions ──────────────────────────────────────

graduation <- graduation %>%
  arrange(unitid, year)

# Drop universities with zero total completions across ALL years
graduation <- graduation %>%
  group_by(unitid) %>%
  mutate(univ_grads_all = sum(ctotalt, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(univ_grads_all > 0) %>%
  select(-univ_grads_all)

# Drop universities with zero completions in even years
graduation <- graduation %>%
  group_by(unitid) %>%
  mutate(univ_grads_even = sum(
    if_else(year %in% c(2008, 2010, 2012, 2014, 2016, 2018, 2020), ctotalt, 0),
    na.rm = TRUE
  )) %>%
  ungroup() %>%
  filter(univ_grads_even > 0) %>%
  select(-univ_grads_even)

# Drop universities with zero completions in interior years
graduation <- graduation %>%
  group_by(unitid) %>%
  mutate(univ_grads_interior = sum(
    if_else(!year %in% c(2008, 2020), ctotalt, 0),
    na.rm = TRUE
  )) %>%
  ungroup() %>%
  filter(univ_grads_interior > 0) %>%
  select(-univ_grads_interior)

# Drop universities with zero enrollment across all years
graduation <- graduation %>%
  group_by(unitid) %>%
  mutate(univ_enroll = sum(eftotlt, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(univ_enroll > 0) %>%
  select(-univ_enroll)

# Drop brute-force exclusion unitids
graduation <- graduation %>%
  filter(!unitid %in% DROP_UNITIDS)

cat("  After sample restrictions:", nrow(graduation), "observations,",
    n_distinct(graduation$unitid), "universities\n")

# ── Create log outcome variables ────────────────────────────────────────────

graduation <- graduation %>%
  mutate(
    l_ctotalt   = log(ctotalt + 1),
    l_cbkaat    = log(cbkaat + 1),
    l_chispt    = log(chispt + 1),
    l_cwhitt    = log(cwhitt + 1),
    l_c2mort    = log(c2mort + 1),
    l_cunknt    = log(cunknt + 1),
    cnonwhite   = ctotalt - cwhitt,
    l_cnonwhite = log(cnonwhite + 1),
    l_male      = log(ctotalm + 1),
    l_female    = log(ctotalw + 1)
  )

# ── Create selectivity classification ───────────────────────────────────────
# Based on 2010 SAT verbal 25th pctl or ACT composite 25th pctl

graduation <- graduation %>%
  arrange(unitid, year)

# Carry forward 2008 SAT/ACT values
graduation <- graduation %>%
  group_by(unitid) %>%
  mutate(
    satv_2008 = if_else(year == 2008, satvr25, NA_real_),
    satm_2008 = if_else(year == 2008, satmt25, NA_real_),
    act_2008  = if_else(year == 2008, actcm25, NA_real_)
  ) %>%
  fill(satv_2008, satm_2008, act_2008, .direction = "down") %>%
  ungroup() %>%
  mutate(
    satv_2008 = replace_na(satv_2008, 0),
    act_2008  = replace_na(act_2008, 0)
  )

# 2010 SAT/ACT for selectivity cutoff (carry both forward and backward)
graduation <- graduation %>%
  group_by(unitid) %>%
  mutate(
    satv_2010 = if_else(year == 2010, satvr25, NA_real_),
    act_2010  = if_else(year == 2010, actcm25, NA_real_)
  ) %>%
  fill(satv_2010, .direction = "downup") %>%
  fill(act_2010, .direction = "downup") %>%
  ungroup() %>%
  mutate(
    satv_2010 = replace_na(satv_2010, 0)
  )

# Selective: SAT verbal 25th > 430, or ACT composite >= 19 if no SAT
graduation <- graduation %>%
  mutate(
    selective = case_when(
      satv_2010 > 430 ~ 1L,
      act_2010 >= 19 & satv_2010 == 0 & !is.na(act_2010) ~ 1L,
      TRUE ~ 0L
    )
  )

# ── Drop 2008 (graduation data starts at 2009) ─────────────────────────────

graduation <- graduation %>%
  filter(year != 2008)

# ── Create event study interaction variables (annual) ───────────────────────
# Rename lead/lag from ETS merge to year_XXXX format

# Drop any year_XXXX vars inherited from ETS merge
graduation <- graduation %>%
  select(-starts_with("year_"))

# Rename lead/lag columns to year_XXXX
# lead5 -> year_2008, lead4 -> year_2009, ..., lag7 -> year_2020
graduation <- graduation %>%
  rename(
    year_2008 = lead5,
    year_2009 = lead4,
    year_2010 = lead3,
    year_2011 = lead2,
    year_2012 = lead1,
    year_2013 = lag0,
    year_2014 = lag1,
    year_2015 = lag2,
    year_2016 = lag3,
    year_2017 = lag4,
    year_2018 = lag5,
    year_2019 = lag6,
    year_2020 = lag7
  )

# Reference year: 2012
graduation <- graduation %>%
  mutate(year_2012 = 0)

# Replace NAs in event study vars with 0
graduation <- graduation %>%
  mutate(across(starts_with("year_"), ~ replace_na(.x, 0)))

# ── Select and order columns to match expected output ───────────────────────

graduation_out <- graduation %>%
  select(
    # Core identifiers
    unitid, year,
    # Completions
    ctotalt, ctotalm, ctotalw, cbkaat, chispt, cwhitt, c2mort, cunknt, casiat,
    # Directory
    name, city, State, ccbasic, carnegie, statefips, statename,
    # State economic controls
    real_income, state_name, unemployment_rate,
    # Population (placeholders)
    cohort_5_9, cohort_10_14, cohort_15_17, population, pop_5_17,
    percentage_change_08_18, pop_percentage_change,
    # State policy controls
    passevals, implementevals, eliminate_tenure, increase_probationary_period,
    weaken_bargaining, eliminate_union_dues, won_race_top, common_core, edtpa,
    # Admissions
    satvr25, satmt25, actcm25, test_optional,
    # University controls (_2 suffix)
    satpct2, actpct2, satvr25_2, satvr75_2, satmt25_2, satmt75_2,
    actcm25_2, actcm75_2, pell_percent2, pell_amount2,
    loan_percent2, loan_average2, enrollment_total2,
    # ETS treatment variables
    any_of(c("ID", "passingscore", "test", "subject", "time", "test_name",
             "test_mean", "test_sd", "z_score", "passingscore_composite",
             "z_score_composite")),
    any_of("state"),
    test_index, any_of("test_index_noncomposite"),
    test_index_lead1, any_of("treat"),
    any_of("continuous_treatment_amount"), continuous_treat,
    treatment_year, time_till,
    # Event study interaction variables (annual)
    year_2008, year_2009, year_2010, year_2011, year_2012,
    year_2013, year_2014, year_2015, year_2016, year_2017,
    year_2018, year_2019, year_2020,
    # Log outcome variables
    l_ctotalt, l_cbkaat, l_chispt, l_cwhitt, l_c2mort, l_cunknt,
    cnonwhite, l_cnonwhite, l_male, l_female,
    # Selectivity variables
    satv_2008, satm_2008, act_2008, satv_2010, act_2010, selective,
    # Catch remaining
    everything()
  )

# Sort by unitid and year (descending within unitid to match Stata output)
graduation_out <- graduation_out %>%
  arrange(unitid, desc(year))

write_xlsx(graduation_out, "data/cleaned/graduation_event_data.xlsx")

cat("  Saved: data/cleaned/graduation_event_data.xlsx\n")
cat("  Rows:", nrow(graduation_out), "\n")
cat("  Universities:", n_distinct(graduation_out$unitid), "\n")
cat("  States:", n_distinct(graduation_out$State), "\n")
cat("  Years:", paste(sort(unique(graduation_out$year)), collapse = ", "), "\n")

# ══════════════════════════════════════════════════════════════════════════════
# Summary
# ══════════════════════════════════════════════════════════════════════════════

cat("\n========================================\n")
cat("Merge Summary\n")
cat("========================================\n")
cat("Created:\n")
cat("  - data/cleaned/enrollment_event_data.xlsx\n")
cat(sprintf("    %d obs, %d universities, %d states\n",
    nrow(enrollment_out), n_distinct(enrollment_out$unitid),
    n_distinct(enrollment_out$State)))
cat("  - data/cleaned/graduation_event_data.xlsx\n")
cat(sprintf("    %d obs, %d universities, %d states\n",
    nrow(graduation_out), n_distinct(graduation_out$unitid),
    n_distinct(graduation_out$State)))
cat("\n04_merge_event_data.R complete.\n")
